* Reset settings and initialize log file
* NOTE(review): -launch- is not a built-in Stata command; presumably a project
* helper that takes the build name through path() -- confirm against its source.
launch, path("build/atus_builder")

*-------------------------------------------------------------------------------
* Price and Wasserman (2024), "The Summer Drop in Female Employment"
*
* Description: Prepare data from the American Time Use Survey.
*-------------------------------------------------------------------------------


* Process person-level data
*-------------------------------------------------------------------------------

* Read in the person-level ATUS extract
gzuse "$basepath/data/raw/atus/$atus_person.dta.gz", clear

* The file must contain exactly one row per respondent (caseid)
gisid caseid

* Begin with the second year of data (2004), when more variables are populated,
* and confirm that the remaining sample spans 2004-2019
drop if year < 2004
quietly summarize year, meanonly
assert r(min) == 2004 & r(max) == 2019

* Build a monthly date from the calendar year and month
generate int tm = ym(year, month)
format tm %tm

* Attach month names, taken in order from c(Months), to codes 1-12
capture label drop month_lbl
local code 0
foreach name in `c(Months)' {
	local ++code
	label define month_lbl `code' "`name'", add
}
label values month month_lbl

* Attach weekday names, taken in order from c(Weekdays), to codes 1-7
capture label drop day_lbl
local code 0
foreach name in `c(Weekdays)' {
	local ++code
	label define day_lbl `code' "`name'", add
}
label values day day_lbl

* Keep prime-age respondents only (ages 25-49)
drop if !inrange(age, 25, 49)

* Keep only interviews that are not flagged as unsuitable for use
drop if dataqual != 200

* Rescale every time-use variable to "hours per week" (factor of 7/60)
foreach timevar of varlist act_* scc_* *_childcare {
	recast double `timevar'
	quietly replace `timevar' = 7/60 * `timevar'
}

* Add up all activity variables to get the total time accounted for
generate double tottime = 0
foreach actvar of varlist act_* {
	quietly replace tottime = tottime + `actvar'
}

* Snap totals within rounding error of a full week to exactly 168 hours
replace tottime = 168 if abs(tottime - 168) < .00001

* Retain only interviews whose diary covers the full week
keep if tottime >= 168

* Indicator for women (sex code 2)
generate byte female = sex == 2

* Primary childcare: move the *_childcare components under a primary_ prefix
* and add them up
rename *_childcare primary_*
generate double primary = primary_basic + primary_educ + primary_rec + primary_travel

* Secondary childcare is the time spent on own household children
rename scc_ownhh secondary

* Total childcare is primary plus secondary, and cannot exceed a full week
generate double childcare = primary + secondary
assert childcare <= 168

* Presence of own household children by age group
generate byte kidund6 = (kidund1 | kid1to2 | kid3to5)
assert kidund18 == (kidund6 | kid6to12 | kid13to17)

* Bin parents by the age of their youngest child; the youngest age bracket
* present takes precedence
generate byte youngest = ///
	cond(kidund18  == 0, 0, ///
	cond(kidund6   == 1, 1, ///
	cond(kid6to12  == 1, 2, ///
	cond(kid13to17 == 1, 3, .))))
assert !missing(youngest)

label define youngest_lbl ///
	0 "No child < 18" ///
	1 "Youngest < 6" ///
	2 "Youngest 6-12" ///
	3 "Youngest 13-17", replace
label values youngest youngest_lbl

* Linear spline in calendar time with knots at the key turning points:
* January 2007, June 2010 and October 2014
local knot1 = ym(2007, 1)
local knot2 = ym(2010, 6)
local knot3 = ym(2014, 10)
mkspline tmspline1 `knot1' tmspline2 `knot2' tmspline3 `knot3' tmspline4 = tm, ///
	displayknots

* Keep only the variables needed downstream, in this column order
local needed caseid year month day tm tmspline* wt06 female age youngest childcare primary primary_* secondary
keep `needed'
order `needed'

* Variable labels: identifiers, calendar variables and demographics
label variable caseid   "Case ID"
label variable year     "Year"
label variable month    "Month"
label variable day      "Day of the week"
label variable tm       "Year/month"
forvalues k = 1/4 {
	label variable tmspline`k' "Linear spline in calendar time"
}
label variable wt06     "Weight"
label variable female   "Female"
label variable age      "Age"
label variable youngest "Binned age of youngest own child in the household"

* Variable labels: childcare totals and primary components
label variable childcare "Time spent on childcare (including secondary)"
label variable primary   "Time spent on primary childcare"
label variable secondary "Time spent on secondary childcare"
label variable primary_basic  "Time spent on basic childcare"
label variable primary_educ   "Time spent on educational childcare"
label variable primary_rec    "Time spent on recreational childcare"
label variable primary_travel "Time spent on travel childcare"

* Remove every value label whose name does not end in "_lbl"
quietly label dir
local lblnames `r(names)'
foreach lbl of local lblnames {
	if regexm("`lbl'", "_lbl$") continue
	label drop `lbl'
}

* Save the person-level data to a temporary file for the merges below
tempfile people
save `people'


* Prepare the activity-level source file
*-------------------------------------------------------------------------------

* Convert the dataset from .dat format to .dta format (skipped when the .dta
* file already exists)
capture confirm file "$basepath/data/raw/atus/$atus_activity.dta"
if _rc != 0 {
	* Remember the working directory so that it can be restored afterwards
	local pwd = c(pwd)
	cd "$basepath/data/raw/atus"

	* Run the conversion under -capture- so that the working directory is
	* restored even if a step fails; the error is re-raised below
	capture noisily {
		* Decompress the raw file if only the gzipped copy is present
		capture confirm file "$basepath/data/raw/atus/$atus_activity.dat"
		if _rc != 0 {
			shell gunzip "$basepath/data/raw/atus/$atus_activity.dat.gz"
			* -shell- does not report failure, so check the result explicitly
			confirm file "$basepath/data/raw/atus/$atus_activity.dat"
		}

		quietly do "$basepath/data/raw/atus/$atus_activity.do"
		compress
		save "$basepath/data/raw/atus/$atus_activity.dta", replace
	}
	local rc = _rc

	* Quote the path: an unquoted -cd- fails on directories containing spaces
	cd "`pwd'"
	if `rc' != 0 exit `rc'
}


* Process activity-level data
*-------------------------------------------------------------------------------

* Read in the activity-level ATUS extract, retaining rows with rectype == 3
use "$basepath/data/raw/atus/$atus_activity.dta", clear
drop if rectype != 3

* The file must contain one row per respondent and activity line
gisid caseid actline

* Keep only the columns used below
keep caseid actline activity duration scc_ownhh_ln start stop

* Activity codes 500000-509999 mark time intervals with unknown activities
generate byte gap = inrange(activity, 500000, 509999)

* Total diary time per respondent, including and excluding those gaps
sort caseid actline
by caseid: egen double tottime_check  = total(duration)
by caseid: egen double tottime_nogaps = total(cond(gap == 0, duration, 0))

* Every diary must add up to 24 hours (in minutes)
assert tottime_check == 24 * 60

* Restrict to respondents in the person-level sample, bringing in the
* calendar variables needed to build time stamps
merge m:1 caseid using `people', assert(1 3) keep(3) keepusing(year month day) nogenerate

* Within the sample, all time must be accounted for without gaps
assert tottime_nogaps * 7/60 == 168
drop tottime_check tottime_nogaps

* Mark each respondent's final diary entry
bysort caseid (actline): generate byte last = _n == _N

* Truncate the last activity at 4am
replace stop = "04:00:00" if last == 1

* Assign a date to the start and end of each activity; anything between
* midnight and 4am belongs to the following morning.
* NOTE(review): -day- is labeled as the day of the week in the person-level
* file, so td appears to be a placeholder date used only to build the clock
* values checked below, not the true diary date -- confirm.
generate td  = mdy(month, day, year)
generate td0 = td + (real(substr(start, 1, 2)) < 4)
generate td1 = td + (last == 1 | real(substr(stop, 1, 2)) < 4)

* Convert the start/end times to Stata clock values
generate double tc0 = clock(string(td0, "%td") + " " + start, "DMYhms")
generate double tc1 = clock(string(td1, "%td") + " " +  stop, "DMYhms")
format tc0 tc1 %tc

* Checks: every interval has positive length, its length matches the recorded
* duration in minutes, and start times never decrease within a diary
assert tc0 < tc1
assert (tc1 - tc0)/60/1000 == duration
bysort caseid (actline): assert tc0[_n - 1] <= tc0 if _n > 1

* Drop time variables that are no longer needed
drop year month day td start stop

* Indicators for the four categories of primary childcare, by activity code
generate basic_activity = inlist(activity, ///
	030101, 030106, 030107, 030108, 030109, 030111, 030199, ///
	030301, 030302, 030303, 030399)
generate educ_activity = inlist(activity, ///
	030102, 030201, 030202, 030203, 030204, 030299)
generate rec_activity = inlist(activity, ///
	030103, 030104, 030105, 030110)
generate travel_activity = inlist(activity, ///
	030112, 180301, 180302, 180303, 180304)

* Secondary childcare is sometimes coded as only a portion of the full time
* interval: check that the share lies in [0, 1] and display its distribution
generate scc_ratio = scc_ownhh_ln/duration
assert inrange(scc_ratio, 0, 1)
summarize scc_ratio if scc_ratio > 0, detail
drop scc_ratio
generate byte secondary_activity = scc_ownhh_ln > 0

* Primary and secondary childcare must never overlap for basic, educational
* or recreational care
foreach kind in basic educ rec {
	assert secondary_activity == 0 if `kind'_activity == 1
}

* The exception is travel: flag intervals recorded as both (confirming that
* at least one exists) and treat them as primary only
generate byte anomaly = (secondary_activity == 1 & travel_activity == 1)
count if anomaly == 1
assert r(N) > 0
replace secondary_activity = 0 if anomaly == 1

* Compute time spent on primary childcare activities, in hours per week.
* Stored as double to match the person-level variables: the default float
* type would round 7/60 * duration and carry that error into the totals
* built from these variables.
foreach v in basic educ rec travel {
	gen double primary_`v' = 7/60 * `v'_activity * duration
}

* Compute time spent on secondary childcare activities, in hours per week
gen double secondary = 7/60 * secondary_activity * scc_ownhh_ln
drop secondary_activity

* Split secondary childcare by the major category (leading digits of the
* activity code) of the primary activity it coincides with
gen major_activity = floor(activity/10000)

* Tag secondary childcare alongside household activities
gen double secondary_hhact = secondary if major_activity == 2

* Tag secondary childcare alongside social, sports, or travel activities
gen double secondary_leisure = secondary if inlist(major_activity, 12, 13, 18)

* Tag secondary childcare alongside all other activities
gen double secondary_other = secondary if !inlist(major_activity, 2, 12, 13, 18)

* Collapse to one row per respondent: sum the childcare components and record
* whether any of the respondent's activities was flagged as an anomaly
gcollapse ///
	(sum) primary_basic primary_educ primary_rec primary_travel ///
	      secondary_hhact secondary_leisure secondary_other ///
	(max) anomaly, by(caseid)

* Mark the primary totals as check values so that they do not clash with the
* person-level variables of the same name
foreach part in basic educ rec travel {
	rename primary_`part' primary_`part'_check
}

* Save to a temporary file for the merge below
tempfile activities
save `activities'


* Combine data derived from the person- and activity-level extracts
*-------------------------------------------------------------------------------

* Join the person-level data to the activity-level aggregates; every
* respondent must appear in both
use `people', clear
merge 1:1 caseid using `activities', assert(3) nogenerate

* The primary childcare totals must agree between the person- and
* activity-level files (up to rounding)
foreach part in basic educ rec travel {
	assert abs(primary_`part' - primary_`part'_check) < .0001
	drop primary_`part'_check
}

* Verify approximate additivity ...
assert childcare == primary + secondary if anomaly == 0
assert primary == primary_basic + primary_educ + primary_rec + primary_travel
assert abs(secondary - secondary_hhact - secondary_leisure - secondary_other) < .0001 if anomaly == 0

* ... and then impose exact additivity
replace secondary = secondary_hhact + secondary_leisure + secondary_other
replace childcare = primary + secondary
drop anomaly


* Finalize the dataset
*-------------------------------------------------------------------------------

* Label the secondary childcare components
local desc_hhact   "Time spent on secondary childcare alongside household activities"
local desc_leisure "Time spent on secondary childcare alongside leisure activities"
local desc_other   "Time spent on secondary childcare alongside other activities"
foreach part in hhact leisure other {
	label variable secondary_`part' "`desc_`part''"
}

* Sort, declare the time-series structure, and write the derived dataset
sort caseid
tsset caseid tm
compress
save "$basepath/data/derived/atus.dta", replace

* Close the log file
unlaunch
